In [1]:
#no need to refresh kernel when changes are made to the helper scripts
%load_ext autoreload
%autoreload 2
In [2]:
import warnings
warnings.filterwarnings('ignore')
In [3]:
import pandas as pd
import numpy as np
import os
import sys
from IPython.display import display,FileLink, Markdown, HTML, Image

cwd = os.path.dirname(os.getcwd()) #add the cwd so that python scripts can be imported.
if cwd not in sys.path:
    sys.path.insert(0, cwd)

#to save in the same directory as the notebook, change to resource_path="".

# print(project_root)
# print(resource_path)
In [4]:
#parameters
gse = "GSE247175"
project_root = cwd
In [5]:
# Parameters
gse = "GSE247175"
project_root = "/home/ajy20/geo2reports"
In [6]:
from pathlib import Path
resource_path = os.path.join(project_root, "output", gse)
Path(resource_path).mkdir(parents=True, exist_ok=True)
In [7]:
import contextlib

@contextlib.contextmanager
def suppress_output(stdout=True, stderr=True, dest='/dev/null'):
    ''' Usage:
    with suppress_output():
        print('hi')
    '''
    dev_null = open(dest, 'a')
    if stdout:
        _stdout = sys.stdout
        sys.stdout = dev_null
    if stderr:
        _stderr = sys.stderr
        sys.stderr = dev_null
    try:
        yield
    finally:
        if stdout:
            sys.stdout = _stdout
        if stderr:
            sys.stderr = _stderr
In [8]:
from Bio import Entrez
from dotenv import load_dotenv

load_dotenv()

os.environ['ENTREZ_EMAIL'] = os.getenv('ENTREZ_EMAIL')

Entrez.email = os.environ['ENTREZ_EMAIL']

id_handle = Entrez.esearch(db="gds", term=f"{gse}[Accession]", retmax=1)
id_record = Entrez.read(id_handle)
gds_id = id_record["IdList"][0]
In [9]:
stream = Entrez.esummary(db='gds', id=gds_id)
record = Entrez.read(stream)
In [10]:
map_species = {
    "homo sapiens": "human",
    "mus musculus": "mouse"
}
In [11]:
species = map_species[record[0]['taxon'].lower()]
In [12]:
if len(record[0]['PubMedIds'])==0: #discard studies with no pubmed citation
    raise ValueError("No PubMed citation found for this study.")
In [13]:
pmid = int(record[0]['PubMedIds'][0])
In [14]:
if len(record[0]['Samples']) not in range(6, 25): #discard studies that dont have 6-24 samples
    raise ValueError("Number of samples need to be within 6-24.")
In [15]:
def fetch_pubmed_metadata(pmid):
    handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
    records = Entrez.read(handle)
    article = records['PubmedArticle'][0]['MedlineCitation']['Article']

    title = article['ArticleTitle']
    journal = article['Journal']['Title']
    journal_abbr = article['Journal']['ISOAbbreviation']
    year = article['Journal']['JournalIssue']['PubDate'].get('Year', '')
    volume = article['Journal']['JournalIssue'].get('Volume', '')
    issue = article['Journal']['JournalIssue'].get('Issue', '')
    pages = article.get('Pagination', {}).get('MedlinePgn', '')
    authors = article.get('AuthorList', [])

    # def format_author(author):
    #     initials = ''.join(author.get('Initials', ''))
    #     return f"{author['LastName']} {initials}"

    def format_author(author):
        initials = '. '.join(author.get('Initials', '')) + '.'
        return f"{author['LastName']}, {initials}"


    authors = [format_author(a) for a in authors]

    return {
        "title": title,
        "journal": journal_abbr,
        "year": year,
        "volume": volume,
        "issue": issue,
        "pages": pages,
        "authors": authors
    }
In [16]:
pmdict = fetch_pubmed_metadata(pmid)
In [17]:
def format_apa_citation(metadata, pmid=None):
    authors = metadata['authors']

    if len(authors) <= 20:
        if len(authors) > 1:
            author_str = ', '.join(authors[:-1]) + ', & ' + authors[-1]
        else:
            author_str = authors[0]
    else:
        author_str = ', '.join(authors[:19]) + ', ... ' + authors[-1]

    citation = (
        f"{author_str} ({metadata['year']}). {metadata['title']}. "
        f"*{metadata['journal']}*, {metadata['volume']}({metadata['issue']}), {metadata['pages']}."
    )

    if metadata.get('doi'):
        citation += f" https://doi.org/{metadata['doi']}"
    elif pmid:
        citation += f" PMID: {pmid}"

    return citation
In [18]:
citation = format_apa_citation(pmdict)
In [19]:
display(Markdown(f"# **Reanalysis of \"{pmdict['title']}\" by {pmdict['authors'][0]} et al., {pmdict['journal']}, {pmdict['year']}**"))

Reanalysis of "Hexamethylene amiloride synergizes with venetoclax to induce lysosome-dependent cell death in acute myeloid leukemia." by Jiang, X. et al., iScience, 2024¶

In [20]:
link = f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={gse}"
display(Markdown(f"{citation}"))
HTML(f'<a href="{link}" target="_blank">Visit GEO accession page</a>')

Jiang, X., Huang, K., Sun, X., Li, Y., Hua, L., Liu, F., Huang, R., Du, J., & Zeng, H. (2024). Hexamethylene amiloride synergizes with venetoclax to induce lysosome-dependent cell death in acute myeloid leukemia.. iScience, 27(1), 108691.

Out[20]:
Visit GEO accession page
In [21]:
from google import genai

os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
#print(os.environ["GOOGLE_API_KEY"])
prompt = f'''
  You are an expert academic writer. Your task is to reformat the provided research information into a concise abstract around 250 words following this exact template:

  "In this study, <FIRST AUTHOR> et al. [1] profiled <CELLS AND CONDITIONS> to further our understanding of <TOPIC>. The reanalysis of this dataset include <FILL IN>"

  Here is the contextual information:
  Author: {pmdict['authors']}
  Title: {pmdict['title']}
  Summary: {record[0]['summary']}

  In the reanalysis explanation, use the following information: the reanalysis is a full RNA-seq analysis pipeline that consists of: UMAP[2], PCA[3], t-SNE[4] plots of the samples; clustergram heatmap; differential gene expression analysis
  for each pair of control and perturbation samples; Enrichment analysis for each gene signature using Enrichr [5, 6, 7]; Transcription factor analysis of gene signatures
  using ChEA3 [8] ; Reverser and mimicker drug match analysis using L2S2 [9] and DRUG-seqr [10], both FDA and non-FDA approved. Results are provided as tables in addition to bar charts.

  Please write the reanalysis as a complete paragraph with smoothly transitioning sentences. Use consistent, present tense.
  Do not omit or change the ordering of the reference numbers.
  Do not change the reference and insert it, in parentheses, where indicated. 

  Now, generate the abstract strictly following the template. Do not include any other text or introductory/concluding remarks.
'''

response = client.models.generate_content(
    model="gemini-2.0-flash", contents=prompt
)
#print(response.text)

Abstract¶

In [22]:
display(Markdown(f"{response.text}\n*This abstract was generated with the assistance of Gemini 2.0 Flash.*"))

In this study, Jiang et al. [1] profiled acute myeloid leukemia (AML) cell lines under treatment with Hexamethylene amiloride (HA) and venetoclax, alone and in combination, to further our understanding of the therapeutic potential of targeting NHE1 in AML. The reanalysis of this dataset includes a comprehensive RNA-seq analysis pipeline consisting of UMAP (Uniform Manifold Approximation and Projection) [2], PCA (Principal Component Analysis) [3], and t-SNE (t-distributed Stochastic Neighbor Embedding) [4] plots for visualizing sample distributions. A clustergram heatmap provides an overview of gene expression patterns across samples. Differential gene expression analysis is performed for each control and perturbation sample pair. Enrichment analysis for each resulting gene signature is conducted using Enrichr [5, 6, 7]. Transcription factor analysis of these gene signatures is performed utilizing ChEA3 [8]. Furthermore, the reanalysis incorporates reverser and mimicker drug match analysis using L2S2 [9] and DRUG-seqr [10], considering both FDA-approved and non-FDA-approved compounds. Results are presented as tables and bar charts.

This abstract was generated with the assistance of Gemini 2.0 Flash.

Methods¶

RNA-seq alignment

Gene count matrices were obtained from ARCHS4 [11], which preprocessed the raw FASTQ data using the Kallisto [12] and STAR [] pseudoalignment algorithm.

Gene matrix processing

The raw gene matrix was filtered to remove genes that do not have an average of 3 reads across the samples. It was then quantile, log2, and z-score normalized. A regex-based function was used to infer whether individual samples belong to a “control” or a “perturbation” group by processing the metadata associated with each sample.

Dimensionality Reduction Visualization

Three types of dimensionality reduction techniques were applied on the processed expression matrices: UMAP[2], PCA[3], and t-SNE[4]. UMAP was calculated by the UMAP Python package and PCA and t-SNE were calculated using the Scikit-Learn Python library. The samples were then represented on 2D scatterplots.

Clustergram Heatmap

As a preliminary step, the top 1000 genes exhibiting most variability were selected. Using this new set, clustergram heatmaps were generated. Two versions of the clustergram exist: an interactive one generated by Clustergrammer [13] and a publication-ready alternative.

Differentially Expressed Genes Calculation and Volcano Plot

Differentially expressed genes between the control and perturbation samples were calculated using Limma Voom [14]. The logFC and -log10p values of each gene were visualized as a volcano scatterplot. Upregulated and downregulated genes were selected according to this criteria: p < 0.05 and |logFC| > 1.0.

Enrichr Enrichment Analysis

The upregulated and down-regulated sets were separately submitted to Enrichr [5, 6, 7]. These sets were compared against libraries from ChEA [8], ARCHS4 [12], Reactome Pathways [15], MGI Mammalian Phenotype [16], Gene Ontology Biological Processes [17], GWAS Catalog [18], KEGG [19, 20, 21], and WikiPathways [22]. The top matched terms from each library and their respective -log10p values were visualized as barplots.

Chea3 Transcription Factor Analysis

The upregulated and down-regulated sets were separately submitted to Chea3 [8]. These sets were compared against the libraries ARCHS4 Coexpression [12], GTEx Coexpression [23], Enrichr [5, 6, 7], ENCODE ChIP-seq [24, 25], ReMap ChIP-seq [26], and Literature-mined ChIP-seq. The top matched TFs were ranked according to their average score across each library and represented as barplots.

L2S2 and Drug-seqr drug analysis

The top 500 up and downregulated sets were submitted simulataneously to identify reverser and mimicker molecules, both FDA and non-FDA approved, from the L2S2 [9] and Drug-seqr [10] databases. The top matched molecules were compiled into tables and visualized as barplots.

In [23]:
%%capture

import json
from datetime import datetime

#write a json file as a catalog list.
metadata_path = Path(os.path.join(project_root, "public", "metadata.json"))

# 1. Load existing metadata (or create empty if file doesn't exist)
if metadata_path.exists():
    with open(metadata_path, "r") as f:
        metadata = json.load(f)
else:
    metadata = {}


entry = {
    "GSE": gse, 
    "author": ", ".join(pmdict['authors']),
    "year": pmdict['year'],
    "species": species,
    "title": pmdict['title'],
    "pmid": pmid,
    "num_samps": len(record[0]['Samples']),
    "samples": ", ".join(sorted([w['Accession'] for w in record[0]['Samples']])),
    "citation": citation,
    "notebook_path": f"{resource_path}/{gse}.ipynb",
    #"report_path": f"{resource_path}/{gse}.html",
    "timestamp": datetime.now().isoformat()
}

# 3. Add entry only if it doesn't already exist
if gse not in metadata:
    metadata[gse] = entry
    print(f"[INFO] Added metadata for {gse}")
else:
    print(f"[INFO] {gse} already exists in metadata. Skipping update.")

# 4. Write updated metadata back to file
with open(metadata_path, "w") as f:
    json.dump(metadata, f, indent=4)
In [24]:
tab_num = 1
fig_num = 1
save_formats = ['png', 'svg', 'jpeg']
In [25]:
import archs4py as a4
#file_path = a4.download.counts("human", path="", version="latest") #comment out if the file is already downloaded
file = os.path.join("/home/ajy20/projects/8--auto-playbook-geo-reports", "human_gene_v2.latest.h5") 
In [26]:
metadata = a4.meta.series(file, gse)
In [27]:
%%capture
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
words_to_remove = ['experiement', 'tissue', 'type', 'batch', 'treatment', 'experiment', 'patient', 'batch', '1', '2', '3', '4', '5', '6', '7', '8', '9']
stopwords_plus = set(stopwords.words('english') + (words_to_remove))
pattern = r'[-,_.:]'
In [28]:
terms_to_remove = ["cell line", "cell type", "genotype", "treatment"]


pattern1 = r"\b(" + "|".join(map(re.escape, terms_to_remove)) + r")\b"

metadata["cleaned_characteristics"] = metadata["characteristics_ch1"].str.replace(
    pattern1, 
    "", 
    flags=re.IGNORECASE, 
    regex=True
).str.replace(r"\s+", " ", regex=True).str.strip()

metadata['cleaned_characteristics'] = metadata['cleaned_characteristics'].apply(lambda x: re.sub(pattern, " ", x).strip().lower())
metadata['cleaned_characteristics'] = metadata['cleaned_characteristics'].apply(
    lambda text: " ".join([word for word in text.split() if word not in stopwords_plus])
)
In [29]:
metadata['clean_title'] = metadata['title'].apply(lambda x: re.sub('[0-9]+', '', x))
metadata['clean_title'] = metadata['clean_title'].apply(lambda x: re.sub(pattern, " ", x).strip().lower())
metadata['clean_title'] = metadata['clean_title'].apply(
    lambda text: " ".join([word for word in text.split() if word not in stopwords_plus])
)

#metadata
In [30]:
groups = metadata.groupby(by='clean_title', level=None)
In [31]:
ctrl_words = set(['wt', 'wildtype', 'control', 'cntrl', 'ctrl', 'uninfected', 'normal', 'untreated', 'unstimulated', 'shctrl', 'ctl', 'healthy', 'sictrl', 'sicontrol', 'ctr', 'wild', 'dmso', 'vehicle', 'naive'])
In [32]:
groupings = {}
for label, group in groups:
    if len(group) not in {3, 4}: #enforce 3-4 samples per group
        raise ValueError("Study does not have 3-4 samples per group")
    
    groupings[label] = group['geo_accession'].tolist()

# print(groupings)
In [33]:
title_conditions = list(groupings.keys())
title_ctrl = []
for c in title_conditions:
    if len(set(c.split()).intersection(ctrl_words)) > 0:
        title_ctrl.append(c)
        
# print(title_conditions)
# print(title_ctrl)    
In [34]:
og_labels = {}
labled_groupings = {}

for label in groupings:
    samps = groupings[label]
    data = list(map(lambda s: s.lower(), metadata.loc[samps]['characteristics_ch1'].values))
    data_clean = []
    for d in data:
        data_clean.append(set(filter(lambda w: w not in stopwords_plus, re.sub(pattern, ' ', d).split())))
    condition = set(data_clean[0])
    for s in data_clean[1:]:
        condition.intersection_update(s)
    condition = ' '.join(list(condition))
    labled_groupings[condition] = samps
    og_labels[condition] = label

ch1_ctrl = []
ch1_conditions = list(labled_groupings.keys())

for condition in labled_groupings:
    split_conditions = condition.lower().split()
    if len(set(split_conditions).intersection(ctrl_words)) > 0:
        ch1_ctrl.append(condition)

# print(og_labels)
# print(ch1_conditions)
# print(ch1_ctrl)
In [35]:
#must have 1-2 controls. Must have perturbation groups as well (not all groups can be controls).
def check_eligibility(conditions, ctrl_conditions):
    if len(ctrl_conditions) not in range(1, 3) or len(ctrl_conditions) == len(conditions):
        return False
    else:
        return True
In [36]:
ch1_eligibility = check_eligibility(ch1_conditions, ch1_ctrl)
title_eligibility = check_eligibility(title_conditions, title_ctrl)
#print(ch1_eligibility)
#rint(title_eligibility)
In [37]:
def compare_groups(title_ctrl, ch1_ctrl, og_labels):
    #convert ch1 condition to corresponding title condition, check if their respective sample sets are equal
    for c in ch1_ctrl:
        ch1_corresponding = og_labels[c]
        if set(groupings[ch1_corresponding]) != set(labled_groupings[c]):
            return False

    return True
In [38]:
if ch1_eligibility and title_eligibility:
    if compare_groups(title_ctrl, ch1_ctrl, og_labels):
        ctrl_conditions = title_ctrl
        conditions = title_conditions
    else:
        raise Exception("Group Assignment Failed")
    
elif ch1_eligibility ^ title_eligibility:
    if ch1_eligibility:
        ctrl_conditions = ch1_ctrl
        conditions = ch1_conditions
        groupings = labled_groupings
    else:
        ctrl_conditions = title_ctrl
        conditions = title_conditions
else:
    raise Exception("Group Assignment Failed")

# print(ctrl_conditions)
# print(conditions)
In [39]:
%%capture
gene_matrix = a4.data.series(file, gse) #raw counts
gene_matrix.to_csv(os.path.join(resource_path, "raw_counts.csv"))
In [40]:
gene_matrix.head(5)
Out[40]:
GSM7884755 GSM7884756 GSM7884757 GSM7884758 GSM7884759 GSM7884760 GSM7884761 GSM7884762 GSM7884763 GSM7884764 GSM7884765 GSM7884766
TSPAN6 0 0 0 0 0 2 1 0 0 0 0 2
TNMD 0 0 0 0 0 0 0 0 0 0 0 0
DPM1 1475 1322 1197 1580 1236 1349 1090 899 1048 1126 1334 1148
SCYL3 417 291 281 357 288 300 385 296 349 264 241 260
C1ORF112 368 304 305 524 310 390 374 255 270 142 176 128
In [41]:
display(Markdown(f"**table {tab_num}**: This is a preview of the first 5 rows of the raw RNA-seq expression matrix from {gse}."))
tab_num +=1
display(FileLink(os.path.join(resource_path, "raw_counts.csv"), result_html_prefix="Download raw counts: "))

table 1: This is a preview of the first 5 rows of the raw RNA-seq expression matrix from GSE247175.

Download raw counts: /home/ajy20/geo2reports/output/GSE247175/raw_counts.csv
In [42]:
# Remove genes with all-zero counts
filtered_matrix = gene_matrix.loc[gene_matrix.sum(axis=1) > 0, :]

# Then filter out low average expression
filtered_matrix = filtered_matrix.loc[filtered_matrix.mean(axis=1) >= 3, :]
In [43]:
from maayanlab_bioinformatics.normalization.log import log2_normalize
from maayanlab_bioinformatics.normalization.zscore import zscore_normalize 
from maayanlab_bioinformatics.normalization.quantile_legacy import quantile_normalize

def normalize(gene_counts):
    norm_exp = quantile_normalize(gene_counts)
    norm_exp = log2_normalize(norm_exp)
    norm_exp = zscore_normalize(norm_exp)
    return norm_exp
In [44]:
%%capture
# from python_scripts.matrix import normalize
norm_matrix = normalize(filtered_matrix) #normalize the matrix for dim reduction and clustergram
In [45]:
def annotate_matrix(expr_df, groupings, ctrl_conditions):
    sampdict = {}
    for group in groupings.keys():
        samps = groupings[group]
        for samp in samps:
            if group in ctrl_conditions:
                sampdict[samp] = "control"
            else:
                sampdict[samp] = "perturbation"
    
    annotat = pd.DataFrame.from_dict(sampdict, orient='index', columns=['group'])
    anndict = {
        'count': expr_df,
        'annotations': annotat
    }
    return anndict
In [46]:
annotated_norm_matrix = annotate_matrix(norm_matrix, groupings, ctrl_conditions)

#print(annotated_norm_matrix['annotations'])
In [47]:
annotated_matrix = annotate_matrix(filtered_matrix.astype('int64'), groupings, ctrl_conditions) #filtered but not normalized

Results¶

In [48]:
save_html = True #if true, render plotly graphs as html and embed with ipython. else, use fig.show()
use_fig_plot = False #if true, render matplotlib graphs using show(), else it will render the saved pngs.

Dimensionality Reduction¶

UMAP¶

In [49]:
import python_scripts.visualizations as vis

vis.plot(annotated_norm_matrix['count'], annotated_norm_matrix['annotations'], n_components=2, save_formats=save_formats, decomp="umap", save_html=save_html, save_path=resource_path)
#if save_html: display(HTML(os.path.join(resource_path, "umap.html")))
display(Image(os.path.join(resource_path, "umap.png"), width=700))
display(Markdown(f"**Figure {fig_num}**: This figure displays a 2D scatter plot of a UMAP decomposition of the sample data. Each point represents an individual sample, colored by its experimental group."))
fig_num+=1

for fmt in save_formats:
    display(FileLink(os.path.join(resource_path, f"umap.{fmt}"), result_html_prefix=f"Download UMAP figure as {fmt}: "))
No description has been provided for this image

Figure 1: This figure displays a 2D scatter plot of a UMAP decomposition of the sample data. Each point represents an individual sample, colored by its experimental group.

Download UMAP figure as png: /home/ajy20/geo2reports/output/GSE247175/umap.png
Download UMAP figure as svg: /home/ajy20/geo2reports/output/GSE247175/umap.svg
Download UMAP figure as jpeg: /home/ajy20/geo2reports/output/GSE247175/umap.jpeg

PCA¶

In [50]:
vis.plot(annotated_norm_matrix['count'], annotated_norm_matrix['annotations'], n_components=2, save_formats=save_formats, decomp="pca", save_html=save_html, save_path=resource_path)
#if save_html: display(HTML(os.path.join(resource_path, "pca.html")))
display(Image(os.path.join(resource_path, "pca.png"), width=700))
display(Markdown(f"**Figure {fig_num}**: This figure displays a 2D scatter plot of a PCA decomposition of the sample data. Each point represents an individual sample, colored by its experimental group."))
fig_num+=1

for fmt in save_formats:
    display(FileLink(os.path.join(resource_path, f"pca.{fmt}"), result_html_prefix=f"Download PCA figure as {fmt}: "))
No description has been provided for this image

Figure 2: This figure displays a 2D scatter plot of a PCA decomposition of the sample data. Each point represents an individual sample, colored by its experimental group.

Download PCA figure as png: /home/ajy20/geo2reports/output/GSE247175/pca.png
Download PCA figure as svg: /home/ajy20/geo2reports/output/GSE247175/pca.svg
Download PCA figure as jpeg: /home/ajy20/geo2reports/output/GSE247175/pca.jpeg

t-SNE¶

In [51]:
vis.plot(annotated_norm_matrix['count'], annotated_norm_matrix['annotations'], n_components=2, save_formats=save_formats, decomp="tsne", save_html=save_html, save_path=resource_path)
#if save_html: display(HTML(os.path.join(resource_path, "tsne.html")))
display(Image(os.path.join(resource_path, "tsne.png"), width=700))
display(Markdown(f"**Figure {fig_num}**: This figure displays a 2D scatter plot using a t-SNE decomposition of the sample data. Each point represents an individual sample, colored by its experimental group."))
fig_num+=1

for fmt in save_formats:
    display(FileLink(os.path.join(resource_path, f"tsne.{fmt}"), result_html_prefix=f"Download t-SNE figure as {fmt}: "))
No description has been provided for this image

Figure 3: This figure displays a 2D scatter plot using a t-SNE decomposition of the sample data. Each point represents an individual sample, colored by its experimental group.

Download t-SNE figure as png: /home/ajy20/geo2reports/output/GSE247175/tsne.png
Download t-SNE figure as svg: /home/ajy20/geo2reports/output/GSE247175/tsne.svg
Download t-SNE figure as jpeg: /home/ajy20/geo2reports/output/GSE247175/tsne.jpeg

Clustergram Heatmaps¶

In [52]:
from maayanlab_bioinformatics.normalization.filter import filter_by_var
norm_t1000 = annotated_norm_matrix['count'].copy()
norm_t1000 = filter_by_var(annotated_norm_matrix['count'], top_n=1000, axis=1)
norm_t1000.columns=metadata['title'].tolist()
In [53]:
t1000_path = os.path.join(resource_path, 'expression_matrix_top1000_genes.txt')
norm_t1000.to_csv(t1000_path, sep='\t')
In [54]:
import requests, json
clustergrammer_url = 'http://amp.pharm.mssm.edu/clustergrammer/matrix_upload/'

r = requests.post(clustergrammer_url, files={'file': open(t1000_path, 'rb')})
link = r.text
In [55]:
from IPython.display import IFrame
display(IFrame(link, width="600", height="650"))
display(Markdown(f"**Figure {fig_num}**: The figure contains an interactive heatmap displaying gene expression for each sample in the RNA-seq dataset. Every row of the heatmap represents a gene, every column represents a sample, and every cell displays normalized gene expression values. The heatmap additionally features color bars beside each column which represent prior knowledge of each sample, such as the tissue of origin or experimental treatment."))
fig_num+=1

Figure 4: The figure contains an interactive heatmap displaying gene expression for each sample in the RNA-seq dataset. Every row of the heatmap represents a gene, every column represents a sample, and every cell displays normalized gene expression values. The heatmap additionally features color bars beside each column which represent prior knowledge of each sample, such as the tissue of origin or experimental treatment.

In [56]:
from python_scripts import visualizations as vis
clustergram = vis.plot_clustergram(norm_t1000)
In [57]:
for fmt in save_formats:
    file_name = os.path.join(resource_path, f"clustergram.{fmt}")
    clustergram.write_image(file_name, width=700, height=700)
In [58]:
# if save_html:
#     cluster_path = os.path.join(resource_path, "clustergram.html")
#     clustergram.write_html(cluster_path)
#     display(HTML(cluster_path))
# else:
#     clustergram.show()

display(Image(os.path.join(resource_path, "clustergram.png"), width=700))
No description has been provided for this image
In [59]:
display(Markdown(f"**Figure {fig_num}**: this figure is a clustergram produced with the graphing library Plotly. It sacrifices some interactivity for a more polished look."))
fig_num+=1

for fmt in save_formats:
    file_name = os.path.join(resource_path, f"clustergram.{fmt}")
    #clustergram.write_image(file_name, width=600, height=600)
    display(FileLink(file_name, result_html_prefix=f"Download as {fmt}: "))

Figure 5: this figure is a clustergram produced with the graphing library Plotly. It sacrifices some interactivity for a more polished look.

Download as png: /home/ajy20/geo2reports/output/GSE247175/clustergram.png
Download as svg: /home/ajy20/geo2reports/output/GSE247175/clustergram.svg
Download as jpeg: /home/ajy20/geo2reports/output/GSE247175/clustergram.jpeg

Differentially Expressed Genes Calculation and Volcano Plots¶

In [60]:
from maayanlab_bioinformatics.dge.limma_voom import limma_voom_differential_expression

sig_names = []
matrix = annotated_matrix['count']
dges = {}

seen = []
for condition in ctrl_conditions:
    for condition2 in conditions:
        if condition!=condition2 and {condition, condition2} not in seen:
            seen.append({condition, condition2})
            
            sig_name = f'{condition}-vs-{condition2}.tsv'
            sig_names.append(sig_name)
            try:
                        with suppress_output():
                            dge = limma_voom_differential_expression(
                                matrix[groupings[condition]],
                                matrix[groupings[condition2]],
                                voom_design=True,
                            )
                        if not dge.empty:
                            dge['logFC'] = dge['logFC'].round(2)
                            dge['AveExpr'] = dge['AveExpr'].round(2)
                            dge['t'] = dge['t'].round(2)
                            dge['B'] = dge['B'].round(2)
                            dges[sig_name] = dge
                            dge.to_csv(os.path.join(resource_path, sig_name), sep='\t')
                        else:
                            print('Empty dge returned for', sig_name)
            except Exception as e:
                print(e)
                print('Error computing:', sig_name)
In [61]:
for sig_name in sig_names:
    dge_path = os.path.join(resource_path, sig_name)
    # table = pd.read_csv(dge_path, sep="\t")

    table = dges[sig_name]
    display(table.head(5))
    display(Markdown(f"**Table {tab_num}**: This is a preview of the first 5 rows of the differentially expressed gene table calculated by Limma Voom."))
    display(FileLink(dge_path, result_html_prefix="Download DGE table: "))
    tab_num+=1
logFC AveExpr t P.Value adj.P.Val B
gene_symbol
SRGN 2.14 9.83 105.66 3.328260e-20 6.334484e-16 36.70
CHI3L1 3.15 9.74 101.23 5.753391e-20 6.334484e-16 36.07
FTH1 2.63 10.86 76.01 2.238429e-18 1.504113e-14 32.71
HSPA8 1.85 9.18 74.83 2.732266e-18 1.504113e-14 32.46
FTL 1.46 11.82 72.27 4.263324e-18 1.877568e-14 31.81

Table 2: This is a preview of the first 5 rows of the differentially expressed gene table calculated by Limma Voom.

Download DGE table: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo.tsv
logFC AveExpr t P.Value adj.P.Val B
gene_symbol
HSPA8 1.35 8.98 32.01 3.204121e-13 7.055474e-09 20.70
SCD 1.21 8.72 26.74 2.851548e-12 3.139554e-08 18.65
FADS2 1.32 7.59 18.41 2.516819e-10 2.792420e-07 14.26
SQLE 1.05 6.75 18.39 2.553503e-10 2.792420e-07 14.22
INSIG1 1.36 6.41 18.11 3.059481e-10 2.929121e-07 14.02

Table 3: This is a preview of the first 5 rows of the differentially expressed gene table calculated by Limma Voom.

Download DGE table: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha.tsv
logFC AveExpr t P.Value adj.P.Val B
gene_symbol
TFRC 0.99 8.48 28.64 2.873655e-12 1.581947e-08 18.67
TUBB4B 0.93 7.84 24.34 1.898124e-11 4.179669e-08 16.81
LPCAT1 0.88 7.85 24.04 2.193322e-11 4.390631e-08 16.67
SFPQ 0.62 9.47 20.80 1.164689e-10 1.349813e-07 15.02
MT-CO1 0.41 13.45 20.34 1.509704e-10 1.662185e-07 14.07

Table 4: This is a preview of the first 5 rows of the differentially expressed gene table calculated by Limma Voom.

Download DGE table: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven.tsv
In [62]:
threshold=1.0
In [63]:
upreg = {}
downreg = {}

upreg_t500 = {}
downreg_t500 = {}

for sig_name in sig_names:
    #dge = pd.read_csv(os.path.join(resource_path, sig_name), sep="\t").set_index("gene_symbol")
    dge = dges[sig_name]

    sig_name = sig_name.replace(".tsv", "")

    up_genes = dge.loc[(dge['P.Value']<0.05) & (dge['logFC']>threshold), :].index.tolist() 
    down_genes = dge.loc[(dge['P.Value']<0.05) & (dge['logFC']<-threshold), :].index.tolist()

    upreg[sig_name] = up_genes
    downreg[sig_name] = down_genes

    up_genes_t500 = dge.loc[(dge['P.Value']<0.05)].sort_values(by="logFC", ascending=False)[:500].index.tolist()
    down_genes_t500 = dge.loc[(dge['P.Value']<0.05)].sort_values(by="logFC")[:500].index.tolist()

    upreg_t500[sig_name] = up_genes_t500
    downreg_t500[sig_name] = down_genes_t500

    save_name = f"{sig_name}_volcano"

    display(Markdown(f"**{sig_name}**"))
    vis.plot_volcano(dge, threshold=threshold, save_formats=save_formats, save_name = save_name, save_html=save_html, save_path=resource_path)
    #if save_html: display(HTML(os.path.join(resource_path, f"{save_name}.html")))
    display(Image(os.path.join(resource_path, f"{save_name}.png"), width=700))
    display(Markdown(f"**Figure {fig_num}**: The figure contains an interactive scatter plot which displays the log2-fold changes and statistical significance of each gene calculated by performing a differential gene expression analysis for the comparison {sig_name}. Every point in the plot represents a gene. Red points indicate significantly up-regulated genes, blue points indicate down-regulated genes."))
    fig_num+=1

    for fmt in save_formats:
        file_name=os.path.join(resource_path, f"{save_name}.{fmt}")
        display(FileLink(file_name, result_html_prefix=f"Download volcano plot as {fmt}: "))

dmso-vs-combo

No description has been provided for this image

Figure 6: The figure contains an interactive scatter plot which displays the log2-fold changes and statistical significance of each gene calculated by performing a differential gene expression analysis for the comparison dmso-vs-combo. Every point in the plot represents a gene. Red points indicate significantly up-regulated genes, blue points indicate down-regulated genes.

Download volcano plot as png: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo_volcano.png
Download volcano plot as svg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo_volcano.svg
Download volcano plot as jpeg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo_volcano.jpeg

dmso-vs-ha

No description has been provided for this image

Figure 7: The figure contains an interactive scatter plot which displays the log2-fold changes and statistical significance of each gene calculated by performing a differential gene expression analysis for the comparison dmso-vs-ha. Every point in the plot represents a gene. Red points indicate significantly up-regulated genes, blue points indicate down-regulated genes.

Download volcano plot as png: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha_volcano.png
Download volcano plot as svg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha_volcano.svg
Download volcano plot as jpeg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha_volcano.jpeg

dmso-vs-ven

No description has been provided for this image

Figure 8: The figure contains an interactive scatter plot which displays the log2-fold changes and statistical significance of each gene calculated by performing a differential gene expression analysis for the comparison dmso-vs-ven. Every point in the plot represents a gene. Red points indicate significantly up-regulated genes, blue points indicate down-regulated genes.

Download volcano plot as png: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven_volcano.png
Download volcano plot as svg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven_volcano.svg
Download volcano plot as jpeg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven_volcano.jpeg

Enrichr: Enrichment Analysis¶

In [64]:
sig_names_clean = [name.replace('.tsv', '') for name in sig_names]
In [65]:
from python_scripts.enrichment import Enrichr_API, enrichr_figure

enrichr_libraries = ["ChEA_2022", "ARCHS4_TFs_Coexp", "Reactome_Pathways_2024", "MGI_Mammalian_Phenotype_Level_4_2024", "GO_Biological_Process_2025", "GWAS_Catalog_2023"]
if species == "human":
    enrichr_libraries.extend(["WikiPathways_2024_Human", "KEGG_2021_Human"])
elif species == "mouse":
    enrichr_libraries.extend(["WikiPathways_2024_Mouse", "KEGG_2019_Mouse"])
else:
    raise Exception("Species not supported.")

enrichr_libraries.sort()

figure_file_format = save_formats

color = "tomato"

Upregulated Set¶

In [66]:
from IPython.display import Image
#upregulated results
for sig_name in sig_names_clean:
    up_file_name = sig_name + ' up_enrichr_results'
    final_output_file_names_up = [str(os.path.join(resource_path, up_file_name+'.'+file_type)) for file_type in figure_file_format]
    uresults = Enrichr_API(upreg[sig_name], enrichr_libraries)
    display(Markdown(f"#### **{sig_name}**"))
    enrichr_figure(uresults[0],uresults[1],uresults[2],final_output_file_names_up, uresults[4],figure_file_format, color, show_plot=False)
    display(Image(final_output_file_names_up[0], width=600)) #display the PNG
    display(Markdown(f'**Figure {fig_num}**: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: ' + str('https://amp.pharm.mssm.edu/Enrichr/enrich?dataset='+ uresults[3])))
    fig_num+=1

    for name in final_output_file_names_up: 
        display(FileLink(name, result_html_prefix=f"Download figure as {name[name.rfind('.')+1:]}:"))

dmso-vs-combo¶

No description has been provided for this image

Figure 9: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=f46b595ce1c8abb5b5f47c6e078592d4

Download figure as png:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo up_enrichr_results.png
Download figure as svg:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo up_enrichr_results.svg
Download figure as jpeg:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo up_enrichr_results.jpeg

dmso-vs-ha¶

No description has been provided for this image

Figure 10: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=9e29b635bff0f028bf85ef8e41c65a83

Download figure as png:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha up_enrichr_results.png
Download figure as svg:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha up_enrichr_results.svg
Download figure as jpeg:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha up_enrichr_results.jpeg

dmso-vs-ven¶

No description has been provided for this image

Figure 11: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=372778e4f84d2a69f1b68d91c1f925a1

Download figure as png:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven up_enrichr_results.png
Download figure as svg:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven up_enrichr_results.svg
Download figure as jpeg:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven up_enrichr_results.jpeg

Downregulated Set¶

In [67]:
#downregulated results
for sig_name in sig_names_clean:
    dn_file_name = sig_name + ' dn_enrichr_results'
    final_output_file_names_dn = [str(os.path.join(resource_path, dn_file_name+'.'+file_type)) for file_type in figure_file_format]
    dresults = Enrichr_API(downreg[sig_name], enrichr_libraries)
    display(Markdown(f"#### **{sig_name}**"))
    enrichr_figure(dresults[0],dresults[1],dresults[2],final_output_file_names_dn, dresults[4],figure_file_format, color, show_plot=False)
    display(Image(final_output_file_names_dn[0], width=600)) #display the PNG
    display(Markdown(f'**Figure {fig_num}**: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: ' + str('https://amp.pharm.mssm.edu/Enrichr/enrich?dataset='+ uresults[3])))
    fig_num+=1

    for name in final_output_file_names_dn: 
        display(FileLink(name, result_html_prefix=f"Download figure as {name[name.rfind('.')+1:]}:"))

dmso-vs-combo¶

No description has been provided for this image

Figure 12: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=372778e4f84d2a69f1b68d91c1f925a1

Download figure as png:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo dn_enrichr_results.png
Download figure as svg:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo dn_enrichr_results.svg
Download figure as jpeg:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo dn_enrichr_results.jpeg

dmso-vs-ha¶

No description has been provided for this image

Figure 13: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=372778e4f84d2a69f1b68d91c1f925a1

Download figure as png:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha dn_enrichr_results.png
Download figure as svg:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha dn_enrichr_results.svg
Download figure as jpeg:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha dn_enrichr_results.jpeg

dmso-vs-ven¶

No description has been provided for this image

Figure 14: This figure contains several barplots depicting enrichment analysis results on the upregulated gene set. Each barplot corresponds to an individual library from Enrichr, and the top matching terms by p-value are depicted in each. Statistically significant terms are represented as red bars while others are represented as gray. Access your Enrichment results here: https://amp.pharm.mssm.edu/Enrichr/enrich?dataset=372778e4f84d2a69f1b68d91c1f925a1

Download figure as png:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven dn_enrichr_results.png
Download figure as svg:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven dn_enrichr_results.svg
Download figure as jpeg:/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven dn_enrichr_results.jpeg

CHEA3: Transcription Factor Enrichment Analysis¶

Upregulated Set¶

In [68]:
import python_scripts.chea3 as chea

#TFs of upregulated genes
for sig_name in sig_names_clean:
    save_name = sig_name + 'upchea'
    up_results = chea.get_chea3_results(upreg[sig_name], 'query')
    chea.mean_rank_bar(up_results, save_name=save_name, save_formats=save_formats, save_html=save_html, save_path=resource_path)
    
    display(Markdown(f"#### **{sig_name}**"))
    #if save_html: display(HTML(os.path.join(resource_path, f"{save_name}.html")))
    display(Image(os.path.join(resource_path, f"{save_name}.png"), width=700))
    display(Markdown(f"**Figure {fig_num}**: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries."))
    fig_num+=1

    for fmt in save_formats:
        file_name=os.path.join(resource_path, f"{save_name}.{fmt}")
        display(FileLink(file_name, result_html_prefix=f"Download bar plot as {fmt}: "))

dmso-vs-combo¶

No description has been provided for this image

Figure 15: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries.

Download bar plot as png: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-comboupchea.png
Download bar plot as svg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-comboupchea.svg
Download bar plot as jpeg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-comboupchea.jpeg

dmso-vs-ha¶

No description has been provided for this image

Figure 16: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries.

Download bar plot as png: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-haupchea.png
Download bar plot as svg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-haupchea.svg
Download bar plot as jpeg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-haupchea.jpeg

dmso-vs-ven¶

No description has been provided for this image

Figure 17: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries.

Download bar plot as png: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-venupchea.png
Download bar plot as svg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-venupchea.svg
Download bar plot as jpeg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-venupchea.jpeg

Downregulated Set¶

In [69]:
for sig_name in sig_names_clean:
    save_name = sig_name + 'dnchea'
    dn_results = chea.get_chea3_results(downreg[sig_name], 'query')
    chea.mean_rank_bar(dn_results, save_name=save_name, save_formats=save_formats, save_html=save_html, save_path=resource_path)
    
    display(Markdown(f"#### **{sig_name}**"))
    #if save_html: display(HTML(os.path.join(resource_path, f"{save_name}.html")))
    display(Image(os.path.join(resource_path, f"{save_name}.png"), width=700))
    display(Markdown(f"**Figure {fig_num}**: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries."))
    fig_num+=1

    for fmt in save_formats:
        file_name=os.path.join(resource_path, f"{save_name}.{fmt}")
        display(FileLink(file_name, result_html_prefix=f"Download bar plot as {fmt}: "))

dmso-vs-combo¶

No description has been provided for this image

Figure 18: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries.

Download bar plot as png: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-combodnchea.png
Download bar plot as svg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-combodnchea.svg
Download bar plot as jpeg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-combodnchea.jpeg

dmso-vs-ha¶

No description has been provided for this image

Figure 19: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries.

Download bar plot as png: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-hadnchea.png
Download bar plot as svg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-hadnchea.svg
Download bar plot as jpeg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-hadnchea.jpeg

dmso-vs-ven¶

No description has been provided for this image

Figure 20: Horizontal bar chart, y-axis represents transcription factors. Displays the top ranked transcription factors for the upregulated set according to their average integrated scores across all the libraries.

Download bar plot as png: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-vendnchea.png
Download bar plot as svg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-vendnchea.svg
Download bar plot as jpeg: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-vendnchea.jpeg

L2S2 and DRUG-seqr: Reverser and Mimicker Drugs¶

Reverser Results¶

In [70]:
from python_scripts.druganalysis import druganalysis

dbs = ['l2s2_fda', 'l2s2_all', 'drugseqr_fda', 'drugseqr_all']

for sig_name in sig_names_clean:
    # up_genes_t500 = upreg_t500[sig_name]
    # down_genes_t500 = downreg_t500[sig_name]
    up_genes = upreg[sig_name]
    down_genes = upreg[sig_name]
    
    rev_drugs = druganalysis(geneset=up_genes_t500, geneset_dn=down_genes_t500, direction="reversers", save_path=resource_path, save_name=sig_name, tab_num=tab_num, fig_num=fig_num)
    display(Markdown(f"#### **{sig_name}**"))
    
    for db in dbs:
        display(Markdown(f"**{db}**"))
        try:
            rev_drugs.display_table(db=db)
            rev_drugs.display_barplot(db=db, save_path=resource_path, save_formats=save_formats)
        except ValueError as e:
            print("Caught error: ", e)

    fig_num = rev_drugs.fig_num
    tab_num  = rev_drugs.tab_num

    display(Markdown("---"))
    

dmso-vs-combo¶

l2s2_fda

Caught error:  No Results for l2s2_fda

l2s2_all

perturbation term pvalueReverse adjPvalueReverse oddsRatioReverse reverserOverlap approved count
0 SL-0101-1 AICHI002_K562_4H_D11_SL-0101-1_0.04uM up 8.99e-01 1 7.01e-01 8 False 192
1 GW-5074 AICHI001_THP1_4H_N14_GW-5074_2.5uM up 1.00e+00 1 0.00e+00 0 False 712

Table 5: Ranked LINCS L1000 signatures predicted to reverse the uploaded geneset.

View in L2S2
Download table: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo_reversers_l2s2_all.tsv
No description has been provided for this image

Figure 21: barplot representation depicting the -log10p values of the top l2s2_all reversers. Red bars represent statistically significant results; otherwise gray.

Download bar plot as png/home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo_reversers_l2s2_all.png
Download bar plot as svg/home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo_reversers_l2s2_all.svg
Download bar plot as jpeg/home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo_reversers_l2s2_all.jpeg

drugseqr_fda

Caught error:  No Results for drugseqr_fda

drugseqr_all

Caught error:  No Results for drugseqr_all

dmso-vs-ha¶

l2s2_fda

Caught error:  No Results for l2s2_fda

l2s2_all

perturbation term pvalueReverse adjPvalueReverse oddsRatioReverse reverserOverlap approved count
0 SL-0101-1 AICHI002_K562_4H_D11_SL-0101-1_0.04uM up 8.99e-01 1 7.01e-01 8 False 192
1 GW-5074 AICHI001_THP1_4H_N14_GW-5074_2.5uM up 1.00e+00 1 0.00e+00 0 False 712

Table 6: Ranked LINCS L1000 signatures predicted to reverse the uploaded geneset.

View in L2S2
Download table: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha_reversers_l2s2_all.tsv
No description has been provided for this image

Figure 22: barplot representation depicting the -log10p values of the top l2s2_all reversers. Red bars represent statistically significant results; otherwise gray.

Download bar plot as png/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha_reversers_l2s2_all.png
Download bar plot as svg/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha_reversers_l2s2_all.svg
Download bar plot as jpeg/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha_reversers_l2s2_all.jpeg

drugseqr_fda

Caught error:  No Results for drugseqr_fda

drugseqr_all

Caught error:  No Results for drugseqr_all

dmso-vs-ven¶

l2s2_fda

Caught error:  No Results for l2s2_fda

l2s2_all

perturbation term pvalueReverse adjPvalueReverse oddsRatioReverse reverserOverlap approved count
0 SL-0101-1 AICHI002_K562_4H_D11_SL-0101-1_0.04uM up 8.99e-01 1 7.01e-01 8 False 192
1 GW-5074 AICHI001_THP1_4H_N14_GW-5074_2.5uM up 1.00e+00 1 0.00e+00 0 False 712

Table 7: Ranked LINCS L1000 signatures predicted to reverse the uploaded geneset.

View in L2S2
Download table: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven_reversers_l2s2_all.tsv
No description has been provided for this image

Figure 23: barplot representation depicting the -log10p values of the top l2s2_all reversers. Red bars represent statistically significant results; otherwise gray.

Download bar plot as png/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven_reversers_l2s2_all.png
Download bar plot as svg/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven_reversers_l2s2_all.svg
Download bar plot as jpeg/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven_reversers_l2s2_all.jpeg

drugseqr_fda

Caught error:  No Results for drugseqr_fda

drugseqr_all

Caught error:  No Results for drugseqr_all

Mimicker Results¶

In [71]:
for sig_name in sig_names_clean:
    # up_genes_t500 = upreg_t500[sig_name]
    # down_genes_t500 = downreg_t500[sig_name]

    up_genes = upreg[sig_name]
    down_genes = upreg[sig_name]

    mim_drugs = druganalysis(geneset=up_genes_t500, geneset_dn=down_genes_t500, direction="mimickers", save_path=resource_path, save_name=sig_name, tab_num=tab_num, fig_num=fig_num)
    display(Markdown(f"#### **{sig_name}**"))
    
    for db in dbs:
        display(Markdown(f"\n**{db}**"))
        try:
            mim_drugs.display_table(db=db)
            mim_drugs.display_barplot(db=db, save_path=resource_path, save_formats=save_formats)
        except ValueError as e:
            print("Caught error: ", e)

    fig_num = mim_drugs.fig_num
    tab_num  = mim_drugs.tab_num

    display(Markdown("---"))
    

dmso-vs-combo¶

l2s2_fda

Caught error:  No Results for l2s2_fda

l2s2_all

perturbation term pvalueMimic adjPvalueMimic oddsRatioMimic mimickerOverlap approved count
0 SL-0101-1 AICHI002_K562_4H_D11_SL-0101-1_0.04uM up 3.77e-09 6.33e-03 3.24e+00 35 False 192
1 GW-5074 AICHI001_THP1_4H_N14_GW-5074_2.5uM up 1.34e-08 1.13e-02 3.14e+00 34 False 712

Table 8: Ranked LINCS L1000 signatures predicted to mimic the uploaded geneset.

View in L2S2
Download table: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo_mimickers_l2s2_all.tsv
No description has been provided for this image

Figure 24: barplot representation depicting the -log10p values of the top l2s2_all mimickers. Red bars represent statistically significant results; otherwise gray.

Download bar plot as png/home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo_mimickers_l2s2_all.png
Download bar plot as svg/home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo_mimickers_l2s2_all.svg
Download bar plot as jpeg/home/ajy20/geo2reports/output/GSE247175/dmso-vs-combo_mimickers_l2s2_all.jpeg

drugseqr_fda

Caught error:  No Results for drugseqr_fda

drugseqr_all

Caught error:  No Results for drugseqr_all

dmso-vs-ha¶

l2s2_fda

Caught error:  No Results for l2s2_fda

l2s2_all

perturbation term pvalueMimic adjPvalueMimic oddsRatioMimic mimickerOverlap approved count
0 SL-0101-1 AICHI002_K562_4H_D11_SL-0101-1_0.04uM up 3.77e-09 6.33e-03 3.24e+00 35 False 192
1 GW-5074 AICHI001_THP1_4H_N14_GW-5074_2.5uM up 1.34e-08 1.13e-02 3.14e+00 34 False 712

Table 9: Ranked LINCS L1000 signatures predicted to mimic the uploaded geneset.

View in L2S2
Download table: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha_mimickers_l2s2_all.tsv
No description has been provided for this image

Figure 25: barplot representation depicting the -log10p values of the top l2s2_all mimickers. Red bars represent statistically significant results; otherwise gray.

Download bar plot as png/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha_mimickers_l2s2_all.png
Download bar plot as svg/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha_mimickers_l2s2_all.svg
Download bar plot as jpeg/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ha_mimickers_l2s2_all.jpeg

drugseqr_fda

Caught error:  No Results for drugseqr_fda

drugseqr_all

Caught error:  No Results for drugseqr_all

dmso-vs-ven¶

l2s2_fda

Caught error:  No Results for l2s2_fda

l2s2_all

perturbation term pvalueMimic adjPvalueMimic oddsRatioMimic mimickerOverlap approved count
0 SL-0101-1 AICHI002_K562_4H_D11_SL-0101-1_0.04uM up 3.77e-09 6.33e-03 3.24e+00 35 False 192
1 GW-5074 AICHI001_THP1_4H_N14_GW-5074_2.5uM up 1.34e-08 1.13e-02 3.14e+00 34 False 712

Table 10: Ranked LINCS L1000 signatures predicted to mimic the uploaded geneset.

View in L2S2
Download table: /home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven_mimickers_l2s2_all.tsv
No description has been provided for this image

Figure 26: barplot representation depicting the -log10p values of the top l2s2_all mimickers. Red bars represent statistically significant results; otherwise gray.

Download bar plot as png/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven_mimickers_l2s2_all.png
Download bar plot as svg/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven_mimickers_l2s2_all.svg
Download bar plot as jpeg/home/ajy20/geo2reports/output/GSE247175/dmso-vs-ven_mimickers_l2s2_all.jpeg

drugseqr_fda

Caught error:  No Results for drugseqr_fda

drugseqr_all

Caught error:  No Results for drugseqr_all

References¶

In [72]:
references = f'''
[1] {citation}

[2] McInnes L, Healy J, Saul N, Großberger L. UMAP: Uniform manifold approximation and projection. Journal of Open Source Software. 2018;3(29):861. doi:10.21105/joss.00861

[3] Clark NR, Ma’ayan A. Introduction to statistical methods to analyze large data sets: Principal Components Analysis. Science Signaling. 2011;4(190):tr3-tr3. doi:10.1126/scisignal.2001967 

[4] van der Maaten L, Hinton G. Visualizing Data using t-SNE. Journal of Machine Learning Research. 2008;9(86):2579-2605.

[5] Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013;128(14)

[6] Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377.

[7] Xie Z, Bailey A, Kuleshov MV, Clarke DJB., Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM, Jeon M, & Ma’ayan A. Gene set knowledge discovery with Enrichr. Current Protocols, 1, e90. 2021. doi: 10.1002/cpz1.90

[8] Keenan AB, Torre D, Lachmann A, Leong AK, Wojciechowicz M, Utti V, Jagodnik K, Kropiwnicki E, Wang Z, Ma'ayan A (2019) ChEA3: transcription factor enrichment analysis by orthogonal omics integration. Nucleic Acids Research. doi: 10.1093/nar/gkz446

[9] Marino GB, Evangelista JE, Clarke DJB, Ma’ayan A. L2S2: chemical perturbation and CRISPR KO LINCS L1000 signature search engine. Nucleic Acids Res. 2025; gkaf373. doi:10.1093/nar/gkaf373

[10] Li J, Ho DJ, Henault M, et al. DRUG-seq Provides Unbiased Biological Activity Readouts for Neuroscience Drug Discovery. ACS Chem Biol. 2022;17(6):1401-1414. doi:10.1021/acschembio.1c00920

[11] Lachmann A, Torre D, Keenan AB, Jagodnik KM, Lee HJ, Wang L, Silverstein MC, Ma'ayan A. Massive mining of publicly available RNA-seq data from human and mouse. Nature Communications 9. Article number: 1366 (2018), doi: 10.1038/s41467-018-03751-6.

[12] Bray, N., Pimentel, H., Melsted, P. et al. Near-optimal probabilistic RNA-seq quantification. Nat Biotechnol 34, 525–527 (2016). https://doi.org/10.1038/nbt.3519

[13] Fernandez, N. F. et al. Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data. Sci. Data 4:170151 doi: 10.1038/sdata.2017.151 (2017).

[14] Ritchie ME, Phipson B, Wu D, Hu Y, Law CW, Shi W, Smyth GK. limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Res. 2015 Apr 20;43(7):e47. doi: 10.1093/nar/gkv007.

[15] Milacic M, Beavers D, Conley P, Gong C, Gillespie M, Griss J, Haw R, Jassal B, Matthews L, May B, Petryszak R, Ragueneau E, Rothfels K, Sevilla C, Shamovsky V, Stephan R, Tiwari K, Varusai T, Weiser J, Wright A, Wu G, Stein L, Hermjakob H, D’Eustachio P. The Reactome Pathway Knowledgebase 2024. Nucleic Acids Research. 2024. doi: 10.1093/nar/gkad1025.

[16] Eppig JT, Smith CL, Blake JA, Ringwald M, Kadin JA, Richardson JE, Bult CJ. Mouse Genome Informatics (MGI): Resources for Mining Mouse Genetic, Genomic, and Biological Data in Support of Primary and Translational Research. Methods Mol Biol. 2017;1488:47-73. doi: 10.1007/978-1-4939-6427-7_3.

[17] Ashburner M, Ball CA, Blake JA, Botstein D, Butler H, Cherry JM, Davis AP, Dolinski K, Dwight SS, Eppig JT, Harris MA, Hill DP, Issel-Tarver L, Kasarskis A, Lewis S, Matese JC, Richardson JE, Ringwald M, Rubin GM, Sherlock G. Gene ontology: tool for the unification of biology. The Gene Ontology Consortium. Nat Genet. 2000 May;25(1):25-9. doi: 10.1038/75556.

[18] Cerezo M, Sollis E, Ji Y, et al. The NHGRI-EBI GWAS Catalog: standards for reusability, sustainability and diversity. Nucleic Acids Res. 2025;53(D1):D998-D1005. doi:10.1093/nar/gkae1070

[19] Kanehisa M, Furumichi M, Sato Y, Matsuura Y, Ishiguro-Watanabe M. KEGG: biological systems database as a model of the real world. Nucleic Acids Res. 2025;53(D1):D672-D677. doi:10.1093/nar/gkae909

[20] Kanehisa M, Goto S. KEGG: kyoto encyclopedia of genes and genomes. Nucleic Acids Res. 2000;28(1):27-30. doi:10.1093/nar/28.1.27

[21] Kanehisa M. Toward understanding the origin and evolution of cellular organisms. Protein Sci. 2019;28(11):1947-1951. doi:10.1002/pro.3715

[22] Pico AR, Kelder T, van Iersel MP, Hanspers K, Conklin BR, Evelo C. WikiPathways: pathway editing for the people. PLoS Biol. 2008 Jul 22;6(7):e184. doi: 10.1371/journal.pbio.0060184.

[23] GTEx Consortium. The Genotype-Tissue Expression (GTEx) project. Nat Genet. 2013 Jun;45(6):580-5. doi: 10.1038/ng.2653.

[24] ENCODE Project Consortium. An integrated encyclopedia of DNA elements in the human genome. Nature. 2012;489(7414):57-74. doi:10.1038/nature11247

[25] Luo Y, Hitz BC, Gabdank I, et al. New developments on the Encyclopedia of DNA Elements (ENCODE) data portal. Nucleic Acids Res. 2020;48(D1):D882-D889. doi:10.1093/nar/gkz1062

[26] Hammal F, de Langen P, Bergon A, Lopez F, Ballester B. ReMap 2022: a database of Human, Mouse, Drosophila and Arabidopsis regulatory regions from an integrative analysis of DNA-binding sequencing experiments. Nucleic Acids Res. 2022;50(D1):D316-D325. doi:10.1093/nar/gkab996

'''
In [73]:
display(Markdown(references))

[1] Jiang, X., Huang, K., Sun, X., Li, Y., Hua, L., Liu, F., Huang, R., Du, J., & Zeng, H. (2024). Hexamethylene amiloride synergizes with venetoclax to induce lysosome-dependent cell death in acute myeloid leukemia.. iScience, 27(1), 108691.

[2] McInnes L, Healy J, Saul N, Großberger L. UMAP: Uniform manifold approximation and projection. Journal of Open Source Software. 2018;3(29):861. doi:10.21105/joss.00861

[3] Clark NR, Ma’ayan A. Introduction to statistical methods to analyze large data sets: Principal Components Analysis. Science Signaling. 2011;4(190):tr3-tr3. doi:10.1126/scisignal.2001967

[4] van der Maaten L, Hinton G. Visualizing Data using t-SNE. Journal of Machine Learning Research. 2008;9(86):2579-2605.

[5] Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013;128(14)

[6] Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377.

[7] Xie Z, Bailey A, Kuleshov MV, Clarke DJB., Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM, Jeon M, & Ma’ayan A. Gene set knowledge discovery with Enrichr. Current Protocols, 1, e90. 2021. doi: 10.1002/cpz1.90

[8] Keenan AB, Torre D, Lachmann A, Leong AK, Wojciechowicz M, Utti V, Jagodnik K, Kropiwnicki E, Wang Z, Ma'ayan A (2019) ChEA3: transcription factor enrichment analysis by orthogonal omics integration. Nucleic Acids Research. doi: 10.1093/nar/gkz446

[9] Marino GB, Evangelista JE, Clarke DJB, Ma’ayan A. L2S2: chemical perturbation and CRISPR KO LINCS L1000 signature search engine. Nucleic Acids Res. 2025; gkaf373. doi:10.1093/nar/gkaf373

[10] Li J, Ho DJ, Henault M, et al. DRUG-seq Provides Unbiased Biological Activity Readouts for Neuroscience Drug Discovery. ACS Chem Biol. 2022;17(6):1401-1414. doi:10.1021/acschembio.1c00920

[11] Lachmann A, Torre D, Keenan AB, Jagodnik KM, Lee HJ, Wang L, Silverstein MC, Ma'ayan A. Massive mining of publicly available RNA-seq data from human and mouse. Nature Communications 9. Article number: 1366 (2018), doi: 10.1038/s41467-018-03751-6.

[12] Bray, N., Pimentel, H., Melsted, P. et al. Near-optimal probabilistic RNA-seq quantification. Nat Biotechnol 34, 525–527 (2016). https://doi.org/10.1038/nbt.3519

[13] Fernandez, N. F. et al. Clustergrammer, a web-based heatmap visualization and analysis tool for high-dimensional biological data. Sci. Data 4:170151 doi: 10.1038/sdata.2017.151 (2017).

[14] Ritchie ME, Phipson B, Wu D, Hu Y, Law CW, Shi W, Smyth GK. limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Res. 2015 Apr 20;43(7):e47. doi: 10.1093/nar/gkv007.

[15] Milacic M, Beavers D, Conley P, Gong C, Gillespie M, Griss J, Haw R, Jassal B, Matthews L, May B, Petryszak R, Ragueneau E, Rothfels K, Sevilla C, Shamovsky V, Stephan R, Tiwari K, Varusai T, Weiser J, Wright A, Wu G, Stein L, Hermjakob H, D’Eustachio P. The Reactome Pathway Knowledgebase 2024. Nucleic Acids Research. 2024. doi: 10.1093/nar/gkad1025.

[16] Eppig JT, Smith CL, Blake JA, Ringwald M, Kadin JA, Richardson JE, Bult CJ. Mouse Genome Informatics (MGI): Resources for Mining Mouse Genetic, Genomic, and Biological Data in Support of Primary and Translational Research. Methods Mol Biol. 2017;1488:47-73. doi: 10.1007/978-1-4939-6427-7_3.

[17] Ashburner M, Ball CA, Blake JA, Botstein D, Butler H, Cherry JM, Davis AP, Dolinski K, Dwight SS, Eppig JT, Harris MA, Hill DP, Issel-Tarver L, Kasarskis A, Lewis S, Matese JC, Richardson JE, Ringwald M, Rubin GM, Sherlock G. Gene ontology: tool for the unification of biology. The Gene Ontology Consortium. Nat Genet. 2000 May;25(1):25-9. doi: 10.1038/75556.

[18] Cerezo M, Sollis E, Ji Y, et al. The NHGRI-EBI GWAS Catalog: standards for reusability, sustainability and diversity. Nucleic Acids Res. 2025;53(D1):D998-D1005. doi:10.1093/nar/gkae1070

[19] Kanehisa M, Furumichi M, Sato Y, Matsuura Y, Ishiguro-Watanabe M. KEGG: biological systems database as a model of the real world. Nucleic Acids Res. 2025;53(D1):D672-D677. doi:10.1093/nar/gkae909

[20] Kanehisa M, Goto S. KEGG: kyoto encyclopedia of genes and genomes. Nucleic Acids Res. 2000;28(1):27-30. doi:10.1093/nar/28.1.27

[21] Kanehisa M. Toward understanding the origin and evolution of cellular organisms. Protein Sci. 2019;28(11):1947-1951. doi:10.1002/pro.3715

[22] Pico AR, Kelder T, van Iersel MP, Hanspers K, Conklin BR, Evelo C. WikiPathways: pathway editing for the people. PLoS Biol. 2008 Jul 22;6(7):e184. doi: 10.1371/journal.pbio.0060184.

[23] GTEx Consortium. The Genotype-Tissue Expression (GTEx) project. Nat Genet. 2013 Jun;45(6):580-5. doi: 10.1038/ng.2653.

[24] ENCODE Project Consortium. An integrated encyclopedia of DNA elements in the human genome. Nature. 2012;489(7414):57-74. doi:10.1038/nature11247

[25] Luo Y, Hitz BC, Gabdank I, et al. New developments on the Encyclopedia of DNA Elements (ENCODE) data portal. Nucleic Acids Res. 2020;48(D1):D882-D889. doi:10.1093/nar/gkz1062

[26] Hammal F, de Langen P, Bergon A, Lopez F, Ballester B. ReMap 2022: a database of Human, Mouse, Drosophila and Arabidopsis regulatory regions from an integrative analysis of DNA-binding sequencing experiments. Nucleic Acids Res. 2022;50(D1):D316-D325. doi:10.1093/nar/gkab996

In [74]:
# %%capture
# !jupyter nbconvert --to html --no-input "{gse}.ipynb"